Still under construction.

(I) Background

  • Instructor: Peng Wang, AVP, Head of Data Science - Operation & Fraud Detection at MassMutual Financial Group (Fall 2016).
  • Goal: Get the latest top 250 movies’ detailed data from IMDb. Link
  • In this project, the terms to be collected are Official Poster, Rank, Link, Title, Year, Content Rating, User Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross. However, the website does not provide every term for every movie.
Name of Terms Name of Variables
Official Poster movie_poster
Rank movie_rank
Link movie_link
Title movie_title
Year movie_year
Content Rating movie_content_rating
User Rating movie_user_rating
Number of Rater movie_num_rater
Genre movie_genre
Budget ($) movie_budget
Opening Weekend USA ($) movie_opening
Gross USA ($) movie_gross
Cumulative Worldwide Gross ($) movie_worldwide_gross

(II) General List

  • Read and load each line of source code of top 250 movies from IMDb.
  • Use regular expressions to extract Official Poster, Rank, Link, Title, Year, and User Rating.
  • The data was collected on 2020-10-30.
# Download the chart page and keep its HTML as one string per line
main_url <- "http://www.imdb.com/chart/top?ref_=ft_250"

main_source_code <- readLines(main_url, encoding = "UTF-8")
# Get each movie's poster, rank, link, title, year and user rating
# Locator: <td class="titleColumn">
#   line - 2: poster
#   line + 1: rank
#   line + 2: link
#   line + 3: title
#   line + 4: year
#   line + 7: user rating
# Extract the poster <img> markup of every movie listed on the chart page.
#
# input_main_source_code:     character vector, one HTML line per element.
# input_main_locator_pattern: regular expression marking each movie entry;
#                             the poster is read from the line two above it.
# Returns a character vector with one "<img ...></img>" string per located
# movie, character(0) when the locator never matches, and NA for entries
# whose poster line cannot be parsed.
h_get_main_movie_poster <- function(input_main_source_code, input_main_locator_pattern) {
  locator_lines <- grep(input_main_locator_pattern, input_main_source_code)
  if (length(locator_lines) == 0L) {
    # Without this guard paste0() turns the empty input into a lone "</img>".
    return(character(0))
  }

  poster_lines <- locator_lines - 2L
  # A locator on line 1 or 2 has no poster line; flag it as missing rather
  # than letting a zero/negative index drop or invert the selection.
  poster_lines[poster_lines < 1L] <- NA_integer_

  pieces <- strsplit(input_main_source_code[poster_lines], "> ", fixed = TRUE)
  # The <img> tag is whatever follows the first "> " on the poster line.
  img_tag <- vapply(pieces, function(piece) piece[2], character(1))

  movie_poster <- paste0(img_tag, "</img>")
  movie_poster[is.na(img_tag)] <- NA_character_
  movie_poster
}

# Wrap each poster <img> tag in a markdown link that points at the image file.
#
# input_movie_poster: character vector of "<img src=\"...\" width=...>" tags.
# Returns a character vector of "[<img ...>](<image url>)" strings, the same
# length as the input (character(0) for empty input).
h_get_main_movie_poster_with_link <- function(input_movie_poster) {
  if (length(input_movie_poster) == 0L) {
    # paste0() would otherwise fabricate a single "[]()" entry.
    return(character(0))
  }

  poster_tags <- as.character(input_movie_poster)

  # The URL is the text between 'src="' and the following '" width='.
  after_src <- vapply(
    strsplit(poster_tags, "src=\"", fixed = TRUE),
    function(piece) piece[2],
    character(1)
  )
  raw_url <- vapply(
    strsplit(after_src, "\" width=", fixed = TRUE),
    function(piece) piece[1],
    character(1)
  )

  paste0("[", poster_tags, "](", raw_url, ")")
}

# Read the rank of every movie from the line directly below its locator line.
# All spaces and the first "." are stripped, so "      1." becomes "1".
# The ranks are returned as character strings.
h_get_main_movie_rank <- function(input_main_source_code, input_main_locator_pattern) {
  locator_hits <- grep(input_main_locator_pattern, input_main_source_code)
  rank_text <- input_main_source_code[locator_hits + 1]
  rank_text <- gsub(" ", "", rank_text, fixed = TRUE)
  sub(".", "", rank_text, fixed = TRUE)
}

# Build the absolute IMDb URL of every movie on the chart page.
#
# The relative path is read from the href attribute two lines below each
# locator line; any query string (everything from "?") is dropped.
# Returns a character vector of URLs, character(0) when the locator never
# matches, and NA where no href could be found.
h_get_main_movie_link <- function(input_main_source_code, input_main_locator_pattern) {
  locator_lines <- grep(input_main_locator_pattern, input_main_source_code)
  if (length(locator_lines) == 0L) {
    return(character(0))
  }

  anchor_lines <- input_main_source_code[locator_lines + 2L]
  after_href <- vapply(
    strsplit(anchor_lines, "href=\"", fixed = TRUE),
    function(piece) piece[2],
    character(1)
  )
  relative_path <- vapply(
    strsplit(after_href, "?", fixed = TRUE),
    function(piece) piece[1],
    character(1)
  )

  movie_link <- paste0("https://www.imdb.com", relative_path)
  # Keep a missing link missing instead of emitting "https://www.imdb.comNA".
  movie_link[is.na(relative_path)] <- NA_character_
  movie_link
}

# Extract the title of every movie on the chart page.
#
# The title is the text between the first ">" and the following "<" on the
# line three below each locator line.
# Returns a character vector of titles, character(0) when the locator never
# matches (the pipeline used to error in that case), and NA where the line
# has no ">".
h_get_main_movie_title <- function(input_main_source_code, input_main_locator_pattern) {
  locator_lines <- grep(input_main_locator_pattern, input_main_source_code)
  if (length(locator_lines) == 0L) {
    return(character(0))
  }

  title_lines <- input_main_source_code[locator_lines + 3L]
  after_open_tag <- vapply(
    strsplit(title_lines, ">", fixed = TRUE),
    function(piece) piece[2],
    character(1)
  )
  vapply(
    strsplit(after_open_tag, "<", fixed = TRUE),
    function(piece) piece[1],
    character(1)
  )
}

# Extract the release year of every movie on the chart page.
#
# The year is the text between the first "(" and the following ")" on the
# line four below each locator line.
# Returns a character vector of years, character(0) when the locator never
# matches (the pipeline used to error in that case), and NA where the line
# has no "(".
h_get_main_movie_year <- function(input_main_source_code, input_main_locator_pattern) {
  locator_lines <- grep(input_main_locator_pattern, input_main_source_code)
  if (length(locator_lines) == 0L) {
    return(character(0))
  }

  year_lines <- input_main_source_code[locator_lines + 4L]
  after_paren <- vapply(
    strsplit(year_lines, "(", fixed = TRUE),
    function(piece) piece[2],
    character(1)
  )
  vapply(
    strsplit(after_paren, ")", fixed = TRUE),
    function(piece) piece[1],
    character(1)
  )
}

# Extract the user rating of every movie on the chart page.
#
# The rating is the text between the first ">" and the following "<" on the
# line seven below each locator line.
# Returns a character vector of ratings, character(0) when the locator never
# matches (the pipeline used to error in that case), and NA where the line
# has no ">".
h_get_main_movie_user_rating <- function(input_main_source_code, input_main_locator_pattern) {
  locator_lines <- grep(input_main_locator_pattern, input_main_source_code)
  if (length(locator_lines) == 0L) {
    return(character(0))
  }

  rating_lines <- input_main_source_code[locator_lines + 7L]
  after_open_tag <- vapply(
    strsplit(rating_lines, ">", fixed = TRUE),
    function(piece) piece[2],
    character(1)
  )
  vapply(
    strsplit(after_open_tag, "<", fixed = TRUE),
    function(piece) piece[1],
    character(1)
  )
}

# Turn each title into a markdown link "[title](link)" pointing at its page.
# Both arguments are combined element-wise.
h_get_main_movie_title_with_link <- function(input_movie_title, input_movie_link) {
  paste("[", input_movie_title, "](", input_movie_link, ")", sep = "")
}
# Get data
# Every extractor is anchored on the same table cell of the chart page.
main_locator_pattern <- "<td class=\"titleColumn\">"

m_poster <- h_get_main_movie_poster(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_poster_with_link <- h_get_main_movie_poster_with_link(input_movie_poster = m_poster)
m_rank <- h_get_main_movie_rank(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_title <- h_get_main_movie_title(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_link <- h_get_main_movie_link(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_year <- h_get_main_movie_year(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_user_rating <- h_get_main_movie_user_rating(
  input_main_source_code = main_source_code,
  input_main_locator_pattern = main_locator_pattern
)
m_title_with_link <- h_get_main_movie_title_with_link(
  input_movie_title = m_title,
  input_movie_link = m_link
)
# Visualization
# Assemble the scraped columns into one table and render it as styled HTML.
main_page <- tibble(`Official Poster` = m_poster,
                    Rank = m_rank,
                    Title = m_title_with_link,
                    Year = m_year,
                    `User Rating` = m_user_rating)
main_page %>%
  kable(align = "c", escape = FALSE) %>%
  kable_styling(bootstrap_options = c("striped",
                                      "hover",
                                      "responsive"),
                fixed_thead = TRUE,
                full_width = FALSE) %>%
  # Row 0 is the header. Style exactly the rows that were scraped instead of
  # assuming the chart always yields 250 entries.
  row_spec(0:nrow(main_page), extra_css = "vertical-align: middle;")
Official Poster Rank Title Year User Rating
The Shawshank Redemption 1 The Shawshank Redemption 1994 9.2
The Godfather 2 The Godfather 1972 9.1
The Godfather: Part II 3 The Godfather: Part II 1974 9.0
The Dark Knight 4 The Dark Knight 2008 9.0
12 Angry Men 5 12 Angry Men 1957 8.9
Schindler's List 6 Schindler’s List 1993 8.9
The Lord of the Rings: The Return of the King 7 The Lord of the Rings: The Return of the King 2003 8.9
Pulp Fiction 8 Pulp Fiction 1994 8.8
The Good, the Bad and the Ugly 9 The Good, the Bad and the Ugly 1966 8.8
The Lord of the Rings: The Fellowship of the Ring 10 The Lord of the Rings: The Fellowship of the Ring 2001 8.8
Fight Club 11 Fight Club 1999 8.8
Forrest Gump 12 Forrest Gump 1994 8.8
Inception 13 Inception 2010 8.7
The Lord of the Rings: The Two Towers 14 The Lord of the Rings: The Two Towers 2002 8.7
Star Wars: Episode V - The Empire Strikes Back 15 Star Wars: Episode V - The Empire Strikes Back 1980 8.7
The Matrix 16 The Matrix 1999 8.6
Goodfellas 17 Goodfellas 1990 8.6
One Flew Over the Cuckoo's Nest 18 One Flew Over the Cuckoo’s Nest 1975 8.6
Seven Samurai 19 Seven Samurai 1954 8.6
Se7en 20 Se7en 1995 8.6
Life Is Beautiful 21 Life Is Beautiful 1997 8.6
City of God 22 City of God 2002 8.6
The Silence of the Lambs 23 The Silence of the Lambs 1991 8.6
It's a Wonderful Life 24 It’s a Wonderful Life 1946 8.6
Star Wars: Episode IV - A New Hope 25 Star Wars: Episode IV - A New Hope 1977 8.6
Saving Private Ryan 26 Saving Private Ryan 1998 8.5
Spirited Away 27 Spirited Away 2001 8.5
The Green Mile 28 The Green Mile 1999 8.5
Parasite 29 Parasite 2019 8.5
Interstellar 30 Interstellar 2014 8.5
Léon: The Professional 31 Léon: The Professional 1994 8.5
The Usual Suspects 32 The Usual Suspects 1995 8.5
Harakiri 33 Harakiri 1962 8.5
The Lion King 34 The Lion King 1994 8.5
Back to the Future 35 Back to the Future 1985 8.5
The Pianist 36 The Pianist 2002 8.5
Terminator 2: Judgment Day 37 Terminator 2: Judgment Day 1991 8.5
American History X 38 American History X 1998 8.5
Modern Times 39 Modern Times 1936 8.5
Psycho 40 Psycho 1960 8.5
Gladiator 41 Gladiator 2000 8.5
City Lights 42 City Lights 1931 8.5
The Departed 43 The Departed 2006 8.5
The Intouchables 44 The Intouchables 2011 8.5
Whiplash 45 Whiplash 2014 8.5
Hamilton 46 Hamilton 2020 8.5
The Prestige 47 The Prestige 2006 8.5
Grave of the Fireflies 48 Grave of the Fireflies 1988 8.5
Once Upon a Time in the West 49 Once Upon a Time in the West 1968 8.4
Casablanca 50 Casablanca 1942 8.4
Cinema Paradiso 51 Cinema Paradiso 1988 8.4
Rear Window 52 Rear Window 1954 8.4
Alien 53 Alien 1979 8.4
Apocalypse Now 54 Apocalypse Now 1979 8.4
Memento 55 Memento 2000 8.4
The Great Dictator 56 The Great Dictator 1940 8.4
Raiders of the Lost Ark 57 Raiders of the Lost Ark 1981 8.4
Django Unchained 58 Django Unchained 2012 8.4
The Lives of Others 59 The Lives of Others 2006 8.4
Joker 60 Joker 2019 8.4
Paths of Glory 61 Paths of Glory 1957 8.4
WALL·E 62 WALL·E 2008 8.4
The Shining 63 The Shining 1980 8.4
Avengers: Infinity War 64 Avengers: Infinity War 2018 8.4
Sunset Blvd. 65 Sunset Blvd. 1950 8.4
Witness for the Prosecution 66 Witness for the Prosecution 1957 8.4
Spider-Man: Into the Spider-Verse 67 Spider-Man: Into the Spider-Verse 2018 8.4
Oldboy 68 Oldboy 2003 8.4
Princess Mononoke 69 Princess Mononoke 1997 8.4
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 70 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 1964 8.3
The Dark Knight Rises 71 The Dark Knight Rises 2012 8.3
Once Upon a Time in America 72 Once Upon a Time in America 1984 8.3
Aliens 73 Aliens 1986 8.3
Your Name. 74 Your Name. 2016 8.3
Avengers: Endgame 75 Avengers: Endgame 2019 8.3
Coco 76 Coco 2017 8.3
American Beauty 77 American Beauty 1999 8.3
Braveheart 78 Braveheart 1995 8.3
3 Idiots 79 3 Idiots 2009 8.3
Das Boot 80 Das Boot 1981 8.3
Toy Story 81 Toy Story 1995 8.3
High and Low 82 High and Low 1963 8.3
Amadeus 83 Amadeus 1984 8.3
Capharnaüm 84 Capharnaüm 2018 8.3
Taare Zameen Par 85 Taare Zameen Par 2007 8.3
Inglourious Basterds 86 Inglourious Basterds 2009 8.3
Star Wars: Episode VI - Return of the Jedi 87 Star Wars: Episode VI - Return of the Jedi 1983 8.3
Reservoir Dogs 88 Reservoir Dogs 1992 8.3
Good Will Hunting 89 Good Will Hunting 1997 8.3
2001: A Space Odyssey 90 2001: A Space Odyssey 1968 8.3
Requiem for a Dream 91 Requiem for a Dream 2000 8.3
Vertigo 92 Vertigo 1958 8.3
M 93 M 1931 8.3
Eternal Sunshine of the Spotless Mind 94 Eternal Sunshine of the Spotless Mind 2004 8.3
Dangal 95 Dangal 2016 8.3
The Hunt 96 The Hunt 2012 8.3
Citizen Kane 97 Citizen Kane 1941 8.3
1917 98 1917 2019 8.3
Full Metal Jacket 99 Full Metal Jacket 1987 8.2
Bicycle Thieves 100 Bicycle Thieves 1948 8.2
The Kid 101 The Kid 1921 8.2
Singin' in the Rain 102 Singin’ in the Rain 1952 8.2
A Clockwork Orange 103 A Clockwork Orange 1971 8.2
North by Northwest 104 North by Northwest 1959 8.2
Snatch 105 Snatch 2000 8.2
Scarface 106 Scarface 1983 8.2
Taxi Driver 107 Taxi Driver 1976 8.2
Ikiru 108 Ikiru 1952 8.2
Lawrence of Arabia 109 Lawrence of Arabia 1962 8.2
Amélie 110 Amélie 2001 8.2
Toy Story 3 111 Toy Story 3 2010 8.2
The Sting 112 The Sting 1973 8.2
Metropolis 113 Metropolis 1927 8.2
A Separation 114 A Separation 2011 8.2
Incendies 115 Incendies 2010 8.2
For a Few Dollars More 116 For a Few Dollars More 1965 8.2
Come and See 117 Come and See 1985 8.2
The Apartment 118 The Apartment 1960 8.2
Double Indemnity 119 Double Indemnity 1944 8.2
To Kill a Mockingbird 120 To Kill a Mockingbird 1962 8.2
Up 121 Up 2009 8.2
Indiana Jones and the Last Crusade 122 Indiana Jones and the Last Crusade 1989 8.2
L.A. Confidential 123 L.A. Confidential 1997 8.2
Heat 124 Heat 1995 8.2
Die Hard 125 Die Hard 1988 8.2
Monty Python and the Holy Grail 126 Monty Python and the Holy Grail 1975 8.2
Rashômon 127 Rashômon 1950 8.2
Yojimbo 128 Yojimbo 1961 8.2
Batman Begins 129 Batman Begins 2005 8.2
Green Book 130 Green Book 2018 8.2
Downfall 131 Downfall 2004 8.2
Children of Heaven 132 Children of Heaven 1997 8.2
Unforgiven 133 Unforgiven 1992 8.2
Ran 134 Ran 1985 8.2
Some Like It Hot 135 Some Like It Hot 1959 8.2
Howl's Moving Castle 136 Howl’s Moving Castle 2004 8.2
A Beautiful Mind 137 A Beautiful Mind 2001 8.2
All About Eve 138 All About Eve 1950 8.2
Casino 139 Casino 1995 8.2
The Great Escape 140 The Great Escape 1963 8.2
The Wolf of Wall Street 141 The Wolf of Wall Street 2013 8.2
Pan's Labyrinth 142 Pan’s Labyrinth 2006 8.2
Anand 143 Anand 1971 8.2
The Secret in Their Eyes 144 The Secret in Their Eyes 2009 8.1
Lock, Stock and Two Smoking Barrels 145 Lock, Stock and Two Smoking Barrels 1998 8.1
Raging Bull 146 Raging Bull 1980 8.1
My Neighbor Totoro 147 My Neighbor Totoro 1988 8.1
There Will Be Blood 148 There Will Be Blood 2007 8.1
Judgment at Nuremberg 149 Judgment at Nuremberg 1961 8.1
The Treasure of the Sierra Madre 150 The Treasure of the Sierra Madre 1948 8.1
Three Billboards Outside Ebbing, Missouri 151 Three Billboards Outside Ebbing, Missouri 2017 8.1
Dial M for Murder 152 Dial M for Murder 1954 8.1
Chinatown 153 Chinatown 1974 8.1
The Gold Rush 154 The Gold Rush 1925 8.1
Babam ve Oglum 155 Babam ve Oglum 2005 8.1
Shutter Island 156 Shutter Island 2010 8.1
No Country for Old Men 157 No Country for Old Men 2007 8.1
V for Vendetta 158 V for Vendetta 2005 8.1
The Seventh Seal 159 The Seventh Seal 1957 8.1
Inside Out 160 Inside Out 2015 8.1
Warrior 161 Warrior 2011 8.1
Vikram Vedha 162 Vikram Vedha 2017 8.1
The Elephant Man 163 The Elephant Man 1980 8.1
The Thing 164 The Thing 1982 8.1
The Sixth Sense 165 The Sixth Sense 1999 8.1
Trainspotting 166 Trainspotting 1996 8.1
Jurassic Park 167 Jurassic Park 1993 8.1
Gone with the Wind 168 Gone with the Wind 1939 8.1
The Truman Show 169 The Truman Show 1998 8.1
Wild Strawberries 170 Wild Strawberries 1957 8.1
Finding Nemo 171 Finding Nemo 2003 8.1
Blade Runner 172 Blade Runner 1982 8.1
Stalker 173 Stalker 1979 8.1
Kill Bill: Vol. 1 174 Kill Bill: Vol. 1 2003 8.1
Room 175 Room 2015 8.1
The Bridge on the River Kwai 176 The Bridge on the River Kwai 1957 8.1
Fargo 177 Fargo 1996 8.1
Memories of Murder 178 Memories of Murder 2003 8.1
Tokyo Story 179 Tokyo Story 1953 8.1
The Third Man 180 The Third Man 1949 8.1
Gran Torino 181 Gran Torino 2008 8.1
On the Waterfront 182 On the Waterfront 1954 8.1
Wild Tales 183 Wild Tales 2014 8.1
The Deer Hunter 184 The Deer Hunter 1978 8.1
Klaus 185 Klaus 2019 8.1
In the Name of the Father 186 In the Name of the Father 1993 8.1
Mary and Max 187 Mary and Max 2009 8.1
Gone Girl 188 Gone Girl 2014 8.1
The Grand Budapest Hotel 189 The Grand Budapest Hotel 2014 8.1
Hacksaw Ridge 190 Hacksaw Ridge 2016 8.1
Andhadhun 191 Andhadhun 2018 8.1
Before Sunrise 192 Before Sunrise 1995 8.1
Catch Me If You Can 193 Catch Me If You Can 2002 8.1
The Big Lebowski 194 The Big Lebowski 1998 8.1
Persona 195 Persona 1966 8.1
To Be or Not to Be 196 To Be or Not to Be 1942 8.1
Prisoners 197 Prisoners 2013 8.1
The Bandit 198 The Bandit 1996 8.1
Sherlock Jr. 199 Sherlock Jr. 1924 8.1
The General 200 The General 1926 8.1
How to Train Your Dragon 201 How to Train Your Dragon 2010 8.1
Ford v Ferrari 202 Ford v Ferrari 2019 8.1
Mr. Smith Goes to Washington 203 Mr. Smith Goes to Washington 1939 8.1
12 Years a Slave 204 12 Years a Slave 2013 8.1
Barry Lyndon 205 Barry Lyndon 1975 8.1
Mad Max: Fury Road 206 Mad Max: Fury Road 2015 8.1
Million Dollar Baby 207 Million Dollar Baby 2004 8.1
Stand by Me 208 Stand by Me 1986 8.1
Network 209 Network 1976 8.1
Cool Hand Luke 210 Cool Hand Luke 1967 8.1
Dead Poets Society 211 Dead Poets Society 1989 8.1
Ben-Hur 212 Ben-Hur 1959 8.1
Hachi: A Dog's Tale 213 Hachi: A Dog’s Tale 2009 8.1
Harry Potter and the Deathly Hallows: Part 2 214 Harry Potter and the Deathly Hallows: Part 2 2011 8.1
Platoon 215 Platoon 1986 8.1
Into the Wild 216 Into the Wild 2007 8.1
Logan 217 Logan 2017 8.1
The Wages of Fear 218 The Wages of Fear 1953 8.0
Monty Python's Life of Brian 219 Monty Python’s Life of Brian 1979 8.0
Rush 220 Rush 2013 8.0
The Handmaiden 221 The Handmaiden 2016 8.0
The Passion of Joan of Arc 222 The Passion of Joan of Arc 1928 8.0
The 400 Blows 223 The 400 Blows 1959 8.0
Andrei Rublev 224 Andrei Rublev 1966 8.0
Hotel Rwanda 225 Hotel Rwanda 2004 8.0
Spotlight 226 Spotlight 2015 8.0
Amores Perros 227 Amores Perros 2000 8.0
Rififi 228 Rififi 1955 8.0
La Haine 229 La Haine 1995 8.0
Nausicaä of the Valley of the Wind 230 Nausicaä of the Valley of the Wind 1984 8.0
Rocky 231 Rocky 1976 8.0
Gangs of Wasseypur 232 Gangs of Wasseypur 2012 8.0
Monsters, Inc. 233 Monsters, Inc. 2001 8.0
Rebecca 234 Rebecca 1940 8.0
Rang De Basanti 235 Rang De Basanti 2006 8.0
Before Sunset 236 Before Sunset 2004 8.0
Portrait of a Lady on Fire 237 Portrait of a Lady on Fire 2019 8.0
In the Mood for Love 238 In the Mood for Love 2000 8.0
Paris, Texas 239 Paris, Texas 1984 8.0
It Happened One Night 240 It Happened One Night 1934 8.0
Drishyam 241 Drishyam 2015 8.0
The Invisible Guest 242 The Invisible Guest 2016 8.0
The Help 243 The Help 2011 8.0
The Princess Bride 244 The Princess Bride 1987 8.0
The Circus 245 The Circus 1928 8.0
The Battle of Algiers 246 The Battle of Algiers 1966 8.0
The Terminator 247 The Terminator 1984 8.0
A Silent Voice: The Movie 248 A Silent Voice: The Movie 2016 8.0
Aladdin 249 Aladdin 1992 8.0
Tangerines 250 Tangerines 2013 8.0

(III) Detailed List

  • Read and load each line of source code of all the 250 movies.
    • Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
  • The data was collected on 2020-10-30.
# Fetch the page behind a single movie link.
# curr_movie_link: URL (or file path) readable by readLines().
# Returns the page as a character vector with one element per line.
h_get_movie_source_code <- function(curr_movie_link) {
  readLines(curr_movie_link, encoding = "UTF-8")
}
# Get the actual release date of the movie.
#
# movie_source_code: character vector holding the movie page, one line each.
# Returns the text following "h4> " on every line that contains
# "Release Date:". The result is character(0) when the page has no such line
# and NA when the label is present but nothing follows it.
get_release_date_from_movie_source_code <- function(movie_source_code) {
  release_date_pattern <- "Release Date:"

  release_date_lines <- movie_source_code[grep(release_date_pattern, movie_source_code)]
  pieces <- strsplit(release_date_lines, "h4> ", fixed = TRUE)
  # vapply() keeps the result a character vector even when nothing matched.
  vapply(pieces, function(piece) piece[2], character(1))
}
# Extract the poster <img> tag from a single movie page and give it a width.
#
# movie_source_code: character vector holding the movie page, one line each.
# output_width:      value written into the tag's width attribute.
# The tag is assembled from the 3rd and 4th lines of the block that starts at
# '<div class="poster">' and ends at the next '</a>    </div>'.
# Returns one "<img ... width=\"<output_width>\">" string, or NA when the page
# has no complete poster block or the tag cannot be parsed.
get_poster_from_movie_source_code <- function(movie_source_code, output_width) {
  poster_start_pattern <- "<div class=\"poster\">"
  poster_end_pattern <- "</a>    </div>"

  start_hits <- grep(poster_start_pattern, movie_source_code, fixed = TRUE)
  if (length(start_hits) == 0L) {
    return(NA_character_)
  }
  # Only the first poster block is used if the marker occurs more than once.
  poster_start_line <- start_hits[1]

  end_hits <- grep(poster_end_pattern, movie_source_code, fixed = TRUE)
  end_hits <- end_hits[end_hits > poster_start_line]
  if (length(end_hits) == 0L) {
    return(NA_character_)
  }
  poster_end_line <- end_hits[1]

  poster_block <- movie_source_code[poster_start_line:poster_end_line]
  img_markup <- paste(poster_block[3:4], collapse = " ")

  # Keep what follows the first "> " and cut the self-closing " /" so a width
  # attribute can be appended before the tag is closed again.
  after_anchor <- strsplit(img_markup, "> ", fixed = TRUE)[[1]][2]
  img_tag <- strsplit(after_anchor, " /", fixed = TRUE)[[1]][1]
  if (is.na(img_tag)) {
    return(NA_character_)
  }

  paste0(img_tag, " width=\"", output_width, "\">")
}
# Preview: collect the large poster of the first three movies and render them.
p <- NULL
for (i in 1:3) {
  curr_movie_sc <- h_get_movie_source_code(m_link[i])
  curr_poster <- get_poster_from_movie_source_code(curr_movie_sc, 75)
  p <- c(p, curr_poster)
}
p_df <- tibble(p)

kable_styling(
  kable(p_df, align = "c", escape = FALSE),
  bootstrap_options = c("striped", "hover", "responsive"),
  fixed_thead = TRUE,
  full_width = FALSE
) %>%
  row_spec(0:3, extra_css = "vertical-align: middle;")
p
The Shawshank Redemption Poster
The Godfather Poster
The Godfather: Part II Poster
# Remove the temporary objects created for the poster preview
rm(list = c("p", "i", "p_df"))
# Get the basic-info JSON (the application/ld+json script block) from the
# source code of a single movie page.
#
# movie_source_code: character vector holding the movie page, one line each.
# Returns the lines from the opening '<script type="application/ld+json">{'
# through the first '}</script>' at or after it, or character(0) when the page
# has no complete block.
h_get_basics_from_movie_source_code <- function(movie_source_code) {
  json_start_pattern <- "<script type=\"application/ld\\+json\">\\{"
  json_end_pattern <- "\\}</script>"

  start_hits <- grep(json_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  json_start_line <- start_hits[1]

  # Ignore closing tags of earlier scripts: picking the first one on the whole
  # page could put the end before the start and reverse the selected range.
  end_hits <- grep(json_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits >= json_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[json_start_line:end_hits[1]]
}
# Report whether any element of `basics` matches the regular expression
# `curr_pattern`. Returns a single TRUE or FALSE.
h_existence_checking <- function(basics, curr_pattern) {
  any(grepl(curr_pattern, basics))
}

# Read the "genre" entry from the basic-info JSON lines.
#
# basics: character vector with the JSON block, as returned by
#         h_get_basics_from_movie_source_code().
# Returns a single string: the genre, several genres joined by ", " when the
# JSON stores them as an array, or "" when there is no "genre" key.
h_get_genre <- function(basics) {
  # Work on the joined text so the value is found whether it sits on the key's
  # line or is spread over following lines (array form).
  json_text <- paste(basics, collapse = " ")
  genre_pattern <- "\"genre\"\\s*:\\s*(\\[[^]]*\\]|\"[^\"]*\")"

  # regexpr() keeps only the first occurrence, so the result is one value.
  hit <- regmatches(json_text, regexpr(genre_pattern, json_text))
  if (length(hit) == 0L) {
    return("")
  }

  value <- sub("^\"genre\"\\s*:\\s*", "", hit)
  quoted <- regmatches(value, gregexpr("\"[^\"]*\"", value))[[1]]
  genres <- gsub("\"", "", quoted, fixed = TRUE)
  paste(genres, collapse = ", ")
}

# Read the "contentRating" entry from the basic-info JSON lines.
#
# basics: character vector with the JSON block.
# Returns a single string with the content rating, or "" when the JSON has no
# quoted "contentRating" value.
h_get_content_rating <- function(basics) {
  json_text <- paste(basics, collapse = " ")
  content_rating_pattern <- "\"contentRating\"\\s*:\\s*\"[^\"]*\""

  # Only the first occurrence is used so the result is always one value and
  # the positional vector built by h_get_basics_info() stays aligned.
  hit <- regmatches(json_text, regexpr(content_rating_pattern, json_text))
  if (length(hit) == 0L) {
    return("")
  }

  sub("^\"contentRating\"\\s*:\\s*\"([^\"]*)\"$", "\\1", hit)
}

# Read the "ratingCount" entry (number of raters) from the basic-info JSON.
#
# basics: character vector with the JSON block.
# Returns a single string of digits, or "" when the JSON has no numeric
# "ratingCount" value.
h_get_rating_count <- function(basics) {
  json_text <- paste(basics, collapse = " ")
  rating_count_pattern <- "\"ratingCount\"\\s*:\\s*\"?[0-9]+"

  # Only the first occurrence is used so the result is always one value.
  hit <- regmatches(json_text, regexpr(rating_count_pattern, json_text))
  if (length(hit) == 0L) {
    return("")
  }

  # The match ends with the digits; keep just those.
  regmatches(hit, regexpr("[0-9]+$", hit))
}

# Read the first "ratingValue" entry from the basic-info JSON.
#
# The key can occur more than once in the block; as before, only the first
# occurrence is used.
# basics: character vector with the JSON block.
# Returns a single string such as "9.3", or "" when the JSON has no numeric
# "ratingValue".
h_get_rating_value <- function(basics) {
  json_text <- paste(basics, collapse = " ")
  rating_value_pattern <- "\"ratingValue\"\\s*:\\s*\"?[0-9]+(\\.[0-9]+)?"

  hit <- regmatches(json_text, regexpr(rating_value_pattern, json_text))
  if (length(hit) == 0L) {
    return("")
  }

  # The match ends with the number; keep just that.
  regmatches(hit, regexpr("[0-9.]+$", hit))
}

# Read the "datePublished" entry from the basic-info JSON lines.
#
# basics: character vector with the JSON block.
# Returns a single string with the date exactly as written in the JSON, or ""
# when there is no quoted "datePublished" value.
h_get_date_published <- function(basics) {
  json_text <- paste(basics, collapse = " ")
  date_published_pattern <- "\"datePublished\"\\s*:\\s*\"[^\"]*\""

  # Only the first occurrence is used so the result is always one value and
  # the positional vector built by h_get_basics_info() stays aligned.
  hit <- regmatches(json_text, regexpr(date_published_pattern, json_text))
  if (length(hit) == 0L) {
    return("")
  }

  sub("^\"datePublished\"\\s*:\\s*\"([^\"]*)\"$", "\\1", hit)
}

# Gather the basic fields of one movie from its JSON lines.
# The result is an unnamed character vector in the fixed order
# genre, content rating, rating count, rating value, date published.
h_get_basics_info <- function(basics) {
  c(
    h_get_genre(basics),
    h_get_content_rating(basics),
    h_get_rating_count(basics),
    h_get_rating_value(basics),
    h_get_date_published(basics)
  )
}
# Get the box office section from the source code of a single movie page.
#
# movie_source_code: character vector holding the movie page, one line each.
# Returns the lines from the "Box Office" sub-heading through the first
# "<hr />" after it, or character(0) when the page has no such section (not
# every movie provides box office figures).
h_get_box_office_from_movie_source_code <- function(movie_source_code) {
  box_office_start_pattern <- "<h3 class=\"subheading\">Box Office</h3>"
  box_office_end_pattern <- "<hr />"

  start_hits <- grep(box_office_start_pattern, movie_source_code, fixed = TRUE)
  if (length(start_hits) == 0L) {
    # Previously this case failed while building the line range.
    return(character(0))
  }
  box_office_start_line <- start_hits[1]

  end_hits <- grep(box_office_end_pattern, movie_source_code, fixed = TRUE)
  end_hits <- end_hits[end_hits > box_office_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[box_office_start_line:end_hits[1]]
}
# Walk through the detail extraction for the first movie on the chart
curr_source_code <- h_get_movie_source_code(m_link[1])

curr_basics <- h_get_basics_from_movie_source_code(curr_source_code)
curr_box_office <- h_get_box_office_from_movie_source_code(curr_source_code)

curr_release_date <- get_release_date_from_movie_source_code(curr_source_code)
curr_basics_info <- h_get_basics_info(curr_basics)

curr_release_date
[1] "14 October 1994 (USA)"
# Show the extracted basic fields
print(curr_basics_info)
[1] "Drama"      "R"          "2299252"    "9.3"        "1994-09-23"
# Dump the raw JSON lines the basic fields were read from
cat(curr_basics)
<script type="application/ld+json">{   "@context": "http://schema.org",   "@type": "Movie",   "url": "/title/tt0111161/",   "name": "The Shawshank Redemption",   "image": "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg",   "genre": "Drama",   "contentRating": "R",   "actor": [     {       "@type": "Person",       "url": "/name/nm0000209/",       "name": "Tim Robbins"     },     {       "@type": "Person",       "url": "/name/nm0000151/",       "name": "Morgan Freeman"     },     {       "@type": "Person",       "url": "/name/nm0348409/",       "name": "Bob Gunton"     },     {       "@type": "Person",       "url": "/name/nm0006669/",       "name": "William Sadler"     }   ],   "director": {     "@type": "Person",     "url": "/name/nm0001104/",     "name": "Frank Darabont"   },   "creator": [     {       "@type": "Person",       "url": "/name/nm0000175/",       "name": "Stephen King"     },     {       "@type": "Person",       "url": "/name/nm0001104/",       "name": "Frank Darabont"     },     {       "@type": "Organization",       "url": "/company/co0040620/"     }   ],   "description": "The Shawshank Redemption is a movie starring Tim Robbins, Morgan Freeman, and Bob Gunton. 
Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.",   "datePublished": "1994-09-23",   "keywords": "wrongful imprisonment,based on the works of stephen king,prison,escape from prison,voice over narration",   "aggregateRating": {     "@type": "AggregateRating",     "ratingCount": 2299252,     "bestRating": "10.0",     "worstRating": "1.0",     "ratingValue": "9.3"   },   "review": {     "@type": "Review",     "itemReviewed": {       "@type": "CreativeWork",       "url": "/title/tt0111161/"     },     "author": {       "@type": "Person",       "name": "alexkolokotronis"     },     "dateCreated": "2008-02-18",     "inLanguage": "English",     "name": "This is How Movies Should Be Made",     "reviewBody": "This movie is not your ordinary Hollywood flick. It has a great and deep message. This movie has a foundation and just kept on being built on from their and that foundation is hope.\n\nOther than just the message of this movie the acting was phenomenal. Tim Robbins gave one of the greatest performances ever. He was inspiring, intelligent and most of all positive. His performance just made me smile. Robbins plays Andy Dufresne who was wrongfully convicted of murdering his wife and her lover. He is gets to life sentences but yet never gives up hope. In he becomes friends with Ellis Boyd \"Red\" Redding played by Morgan Freeman. Freeman who gives the finest performance of his career has unlike Robbins lost hope. He is in deep regret of the crime that he committed. His way of deflecting the pain away is by trying to not feel anything at all. With his friendship with Andy he learns that without our hopes and dreams we have nothing. Andy also becomes friends with the rest of Red\u0027s group. James Whitmore also gave a great performance as Brooks Halten who gets out of prison parole but in the words of Red he has been \"institutionalized\". \n\nThe directing by Frank Darabont was just magnificent. 
He kept this movie at a great steady pace along with the writing and great cinematography. He portrayed prison life in such a horrifying way, but not in terms of the physical pain but the stress and pain that wares mentally on the inmates, some of which deserve a second chance. \n\nWhatever you do, don\u0027t listen to the people who say this movie is overrated because this is one of the most inspiring and greatest movies ever. It has everything you could possibly want.",     "reviewRating": {       "@type": "Rating",       "worstRating": "1",       "bestRating": "10",       "ratingValue": "10"     }   },   "duration": "PT2H22M",   "trailer": {     "@type": "VideoObject",     "name": "Official Trailer",     "embedUrl": "/video/imdb/vi3877612057",     "thumbnail": {       "@type": "ImageObject",       "contentUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg"     },     "thumbnailUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg",     "description": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.",     "uploadDate": "2014-03-05T14:13:19Z"   } }</script>
# Dump the raw box office section
cat(curr_box_office)
    <h3 class="subheading">Box Office</h3>          <div class="txt-block">             <h4 class="inline">Budget:</h4>$25,000,000             <span class="attribute">(estimated)</span>         </div>          <div class="txt-block">             <h4 class="inline">Opening Weekend USA:</h4> $727,327, <span class="attribute">25 September 1994</span>        </div>          <div class="txt-block"> <h4 class="inline">Gross USA:</h4> $28,699,976        </div>          <div class="txt-block"> <h4 class="inline">Cumulative Worldwide Gross:</h4> $28,815,291        </div>      <span class="see-more inline">         <a href="https://pro.imdb.com/title/tt0111161?rf=cons_tt_bo_tt&ref_=cons_tt_bo_tt" >See more on IMDbPro</a>&nbsp;&raquo;     </span>   <hr />
Target Regular Expression
Title h1 itemprop="name"
Year Next line of Title
Content Rating meta itemprop="contentRating"
User Rating span itemprop="ratingValue"
Number of Rater itemprop="ratingCount"
Genre span class="itemprop" itemprop="genre"
Budget <h4 class="inline">Budget
Opening Weekend USA ($) <h4 class="inline">Opening Weekend USA
Gross USA ($) <h4 class="inline">Gross
Cumulative Worldwide Gross ($) <h4 class="inline">Cumulative
# Scrape the detail fields of a single movie from its IMDb title page.
#
# Args:
#   input: location of one movie page, normally an element of `movie_link`.
#          It is handed to readLines() as `con`, so a URL or a local file
#          path both work.
#
# Returns:
#   An unnamed character vector of length 10 holding, in order: title, year,
#   content rating, user rating, number of raters, genres (joined by ", "),
#   budget, opening weekend USA, gross USA and cumulative worldwide gross.
#   A field that cannot be found or parsed is reported as "-". When a locator
#   matches several lines, only the first match is used, so the result always
#   has exactly 10 elements.
get.target.info=function(input){
  temp=readLines(con=input,encoding="UTF-8")
  missing.value="-"
  
  # n-th piece of the first element of `x` after splitting on the literal
  # string `split`; NA when `x` is empty/NA or has fewer than `n` pieces.
  get.piece=function(x,split,n){
    if (length(x)==0 || is.na(x[1])){
      return(NA_character_)
    }
    strsplit(x[1],split=split,fixed=TRUE)[[1]][n]
  }
  
  # Text that follows the n-th ">" of a line, cut at the next "<".
  get.tag.text=function(x,n){
    get.piece(get.piece(x,">",n),"<",1)
  }
  
  # All lines of the page that contain the literal string `marker`.
  find.lines=function(marker){
    temp[grep(marker,temp,fixed=TRUE)]
  }
  
  # Money amount printed after the box-office heading located by `marker`:
  # the first whitespace-delimited token after the heading, with one trailing
  # comma removed (the page prints e.g. " $727,327, <span ...>date</span>").
  get.amount=function(marker){
    raw=get.piece(find.lines(marker),">",3)
    if (is.na(raw)){
      return(NA_character_)
    }
    token=strsplit(trimws(raw),split="[[:space:]]+")[[1]][1]
    sub(",$","",token)
  }
  
  # Replace an absent/unparsable value by the "-" placeholder.
  or.missing=function(x){
    if (length(x)==0 || is.na(x[1])) missing.value else x[1]
  }
  
  # 1. title
  # NOTE(review): the title is cut at the first "&" (normally the "&nbsp;"
  # that precedes the year), so a title containing an HTML entity such as
  # "&amp;" is truncated there.
  title.at=grep("h1 itemprop=\"name\"",temp,fixed=TRUE)
  temp.movie_title=get.piece(get.piece(temp[title.at],">",2),"&",1)
  
  # 2. year (sits on the line right after the title line)
  temp.movie_year=get.tag.text(temp[title.at+1],2)
  
  # 3. content rating
  temp.movie_content_rating=get.piece(find.lines("meta itemprop=\"contentRating\""),">",2)
  
  # 4. user rating
  temp.movie_user_rating=get.tag.text(find.lines("span itemprop=\"ratingValue\""),3)
  
  # 5. number of rater
  temp.movie_num_rater=get.tag.text(find.lines("itemprop=\"ratingCount\""),3)
  
  # 6. genre (one line per genre; all of them are kept)
  genre.lines=find.lines("span class=\"itemprop\" itemprop=\"genre\"")
  genres=vapply(genre.lines,get.tag.text,character(1),n=3,USE.NAMES=FALSE)
  genres=genres[!is.na(genres)]
  if (length(genres)==0){
    temp.movie_genre=missing.value
  } else {
    temp.movie_genre=paste(genres,collapse=", ")
  }
  
  # 7. budget
  temp.movie_budget=get.piece(find.lines("<h4 class=\"inline\">Budget"),">",3)
  if (!is.na(temp.movie_budget)){
    if (grepl("^[A-Z]{3}",temp.movie_budget)){
      # Three-letter currency code (FRF, JPY, INR, ...) glued to the amount
      # with "&nbsp;": keep "<code> <amount>".
      parts=strsplit(temp.movie_budget,split="&nbsp;",fixed=TRUE)[[1]]
      if (length(parts)>=2){
        temp.movie_budget=paste(parts[1],parts[2],sep=" ")
      }
    } else if (startsWith(temp.movie_budget,"&euro;")){
      temp.movie_budget=paste("EUR",substr(temp.movie_budget,start=7,stop=nchar(temp.movie_budget)))
    } else if (startsWith(temp.movie_budget,"&pound;")){
      temp.movie_budget=paste("GBP",substr(temp.movie_budget,start=8,stop=nchar(temp.movie_budget)))
    }
  }
  
  # 8. opening
  temp.movie_opening=get.amount("<h4 class=\"inline\">Opening Weekend USA")
  
  # 9. gross
  temp.movie_gross=get.amount("<h4 class=\"inline\">Gross")
  
  # 10. worldwide gross
  temp.movie_worldwide_gross=get.amount("<h4 class=\"inline\">Cumulative")
  
  # 11. result
  # Deliberately unnamed: callers index the fields by position.
  c(or.missing(temp.movie_title),
    or.missing(temp.movie_year),
    or.missing(temp.movie_content_rating),
    or.missing(temp.movie_user_rating),
    or.missing(temp.movie_num_rater),
    temp.movie_genre,
    or.missing(temp.movie_budget),
    or.missing(temp.movie_opening),
    or.missing(temp.movie_gross),
    or.missing(temp.movie_worldwide_gross))
}

#Collecting data
# Visit every page listed in `movie_link` and store each scraped field in its
# own character vector. The vectors are preallocated to the number of links
# instead of being grown inside the loop, and the loop bound follows
# `movie_link` rather than a hard-coded 250.
n.movies=length(movie_link)
movie_title=character(n.movies)
movie_year=character(n.movies)
movie_content_rating=character(n.movies)
movie_user_rating=character(n.movies)
movie_num_rater=character(n.movies)
movie_genre=character(n.movies)
movie_budget=character(n.movies)
movie_opening=character(n.movies)
movie_gross=character(n.movies)
movie_worldwide_gross=character(n.movies)
for (i in seq_len(n.movies)){
  # get.target.info() returns the fields positionally: title, year, content
  # rating, user rating, number of raters, genre, budget, opening, gross,
  # worldwide gross.
  temp.target.info=get.target.info(movie_link[i])
  movie_title[i]=temp.target.info[1]
  movie_year[i]=temp.target.info[2]
  movie_content_rating[i]=temp.target.info[3]
  movie_user_rating[i]=temp.target.info[4]
  movie_num_rater[i]=temp.target.info[5]
  movie_genre[i]=temp.target.info[6]
  movie_budget[i]=temp.target.info[7]
  movie_opening[i]=temp.target.info[8]
  movie_gross[i]=temp.target.info[9]
  movie_worldwide_gross[i]=temp.target.info[10]
}

#Visualization
# Assemble the scraped vectors into one table, store every column as text and
# render the table with centred cells and human-readable headers.
library(knitr)
table.headers=c("Rank","Title","Year","Content Rating","User Rating",
                "Number of Rater","Genre","Budget","Opening Weekend USA",
                "Gross USA","Cumulative Worldwide Gross")
y=data.frame(movie_rank,
             movie_title,
             movie_year,
             movie_content_rating,
             movie_user_rating,
             movie_num_rater,
             movie_genre,
             movie_budget,
             movie_opening,
             movie_gross,
             movie_worldwide_gross)
# Convert all columns to character in one pass (keeps names and row order).
y[]=lapply(y,as.character)
kable(y,align="c",col.names=table.headers)